library(gridExtra)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## âś” ggplot2 3.4.0      âś” purrr   0.3.5 
## âś” tibble  3.1.8      âś” dplyr   1.0.10
## âś” tidyr   1.2.1      âś” stringr 1.5.0 
## âś” readr   2.1.3      âś” forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::combine() masks gridExtra::combine()
## âś– dplyr::filter()  masks stats::filter()
## âś– dplyr::lag()     masks stats::lag()
library(dplyr)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(rsample)
library(recommenderlab)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: arules
## 
## Attaching package: 'arules'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## 
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## 
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## 
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## 
## The following object is masked from 'package:base':
## 
##     as.matrix
## 
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
library(ggpubr)
library(lsa)
## Loading required package: SnowballC
library(SnowballC)
data(MovieLense)

1. Explorative Datenanalyse (vor Datenreduktion)

mx_user_film <- as(MovieLense, "matrix")  # convert realratingmatrix to normal matrix
df_user_film <- as.data.frame(mx_user_film)   #  convert matrix to dataframe form
df_film_user <- as.data.frame(t(mx_user_film)) # transpose the dataframe: each row is a movie name, each column is a user

1.1 Welches sind die am häufigsten geschauten Genres/Filme?

df_11 <- df_film_user %>% mutate(cnt = rowSums(!is.na(df_film_user))) %>% arrange(desc(cnt)) %>% filter(cnt == max(cnt)) %>% select('cnt')
print(paste("The most frequently watched film is ", rownames(df_11)))
## [1] "The most frequently watched film is  Star Wars (1977)"

1.2 Wie verteilen sich die Kundenratings gesamthaft und nach Genres?

  • firstly let’s have a check the user ratings distribution
df_unlist <- data.frame(rating=unlist(df_film_user))            # unlist the dataframe
ggplot(df_unlist,aes(rating)) + geom_histogram(binwidth=1) +   # die Verteilung der Kundenratings gesamthaft
  labs(x="Ratings", y="Count",title="Distribution of the user ratings") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1469760 rows containing non-finite values (`stat_bin()`).

  • The above histogram of ratings distribution is left skewed, with the mode = 4. This means 4 is the most frequently rated score, followed by 3, then 5.
mx_film_genre <- as.data.frame(MovieLenseMeta)
rownames(mx_film_genre) <- mx_film_genre$title
mx_film_genre <- as.matrix(mx_film_genre[,5:22])   # Movie Genre Matrix
mx_user_film[is.na(mx_user_film)] <- 0
mx_user_genre <- mx_user_film %*% mx_film_genre
mx_genre_user <- as.data.frame(t(mx_user_genre))    # a: Stärke Genre Kombination vollständig
mx_genre_user$summe <- rowSums(mx_genre_user)               # new column "summe": summe user ratings of each genre
mx_genre_user <- cbind(genre = rownames(mx_genre_user), mx_genre_user)# new column "genre": genre name copied from rownames
df_12 <- as.data.frame(t(mx_film_genre)) %>% mutate(cnt = rowSums(as.data.frame(t(mx_film_genre))))%>% arrange(desc(cnt))  # add new column: count for each genres
df_12 <- cbind(genres = rownames(df_12),df_12)               # index as a column
rownames(df_12) <- 1:nrow(df_12)                            # generate new index
g1 = ggplot(df_12,aes(x = (reorder(genres,cnt)), y = cnt)) + geom_col() + coord_flip() +
  labs(y="Number of Views", x="Genres",title="Distribution by genres") +
  theme(plot.title = element_text(hjust = 0.5))

g2 = ggplot(mx_genre_user,aes(x=reorder(genre,summe),y=summe)) + geom_col() + coord_flip()+ labs( x="Genre",y= "summed ratings of all users",title="Distribution by genre") +
  theme(plot.title = element_text(hjust = 0.5))
mx_genre_user <- mx_genre_user %>% select(-genre)
grid.arrange(g1, g2, nrow = 1)

  • The left plot shows the distribution of views by genre. The genre drama has the highest number of views, followed by “comedy” and “action”. Fantasy is the least viewed film.
  • The right plot shows the accumulated user ratings by genre. Drama has the highest ratings, followed by comedy and action. This doesn’t mean that the three movie genres have the best average ratings, cause we have to include the information that they are the most viewed films. Documentary has the least accumulated ratings.

1.3 & 1.4 Wie verteilen sich die mittleren Kundenratings pro Film? Wie stark streuen die Ratings von individuellen Kunden?

df_avg_rating_film <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g1 = ggplot(df_avg_rating_film,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +                
  # die Verteilung
  labs(x="average ratings", y="Count of films",title="Distribution of avg-ratings per film") +
  theme(plot.title = element_text(hjust = 0.5))
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g2 = ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +                
  # die Verteilung
  labs(x="average ratings", y="Count of users",title="Distribution of avg-ratings per user")+
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1, g2, nrow = 1)

  • left plot: Here I calculated the average rating of each film and counted them. The most frequent average ratings are 3 and 4. Together, at least 75% of the movies have average ratings of 3 or 4. Very few films become an average rating of 5. Reasons could be either the films are only watched and rated by a small group and this group of users really love them, or the films are simply good made that all the viewers have given the best ratings.

  • right plot: here I calculated the average ratings by user to check the user rating habits. The most of the users have the average ratings around 3.3 and 4.2. The plot shows us that the users have very differently rating preferences.

1.5 Welchen Einfluss hat die Normierung der Ratings pro Kunde auf deren Verteilung?

  • We know that each user have different rating preferences. Some may give between 3 and 5, some between 2 and 4, or some use larger range between 1 and 5… With this influence, it is hard to define what is a good rating, because for one user 4 may mean a very good film, but for another user 4 is a standard normal film. Therefore we could not recommend the films the users really like.
dev.new(width=8, height=2, unit="in")
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g1 = ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +                
  # die Verteilung
  labs(x="average ratings per user", y="Count of users",title="Non-normalized ratings per user")+
  theme(plot.title = element_text(hjust = 0.5))
normalized_movielens <- as(normalize(MovieLense,method = "z-score"), "matrix")
normalized_movielens <- as.data.frame(normalized_movielens)
normalized_avg_rating_user <- normalized_movielens %>% mutate(avg_rating=rowMeans(normalized_movielens,na.rm=TRUE, dims=1)) %>% select('avg_rating')
g2 = ggplot(normalized_avg_rating_user,aes(avg_rating)) + geom_histogram(bins =  5,fill="black", col="grey") +                # die Verteilung
  labs(x="average normalized ratings per user", y="Count of users",title="z-score Normalized ratings per user") +
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1, g2, nrow = 1)
  • without normalization (left plot): the average ratings per user are slightly right skewed with mode of 3.3. The values vary a lot across different users.
  • with Z-score normalization (right plot): the average ratings per user are all around 0 (± 6e-07 could be ignored). This means, the ratings from different users are normalized to the same scale, with the mean of 0 and standard deviation of 1. This will reduce the influence of rating habits of different users.

1.6 Welche strukturellen Charakteristika (z.B. Sparsity) und Auffälligkeiten zeigt die User-Item Matrix?

  • The User-Item Matrix of MovieLense dataset contains a large numbers of users(rows) and items (columns), but a single user interacts with only a small subset of the items. This means, the dataframe contains many zero values or NA, the structure is extremely sparse. If simply using the sparse dataframe, it will waste lots of time on the NA.
  • Therefore it will make sense to reduce the users who have very few ratings, and the films which get very few ratings.

2 Datenreduktion:

  • Reduziere den MovieLense Datensatz auf rund 400 Kunden und 700 Filme, indem du Filme und Kunden mit sehr wenigen Ratings entfernst.

2.1 Anzahl Filme und Kunden sowie Sparsity vor und nach Datenreduktion

  • reduce data to 700 most rated movies and 400 users with most ratings
df_reduced <- df_film_user %>% mutate(n_pro_film = rowSums(!is.na(df_film_user))) %>% arrange(desc(n_pro_film)) %>% slice(0:700)%>% select(-n_pro_film)       # reduce to 700 movies
df_reduced <- as.data.frame(t(df_reduced))
df_reduced <- df_reduced %>% mutate(n_pro_user = rowSums(!is.na(df_reduced))) %>% arrange(desc(n_pro_user)) %>% slice(0:400) %>% select(-n_pro_user)  # reduce to 400 users
print(dim(df_user_film)); print(dim(df_reduced))
## [1]  943 1664
## [1] 400 700
print(paste0('Before data reduction, NA data proportion is ',sum(is.na(df_user_film))/(1664*943)))
## [1] "Before data reduction, NA data proportion is 0.936658781303532"
print(paste0('After data reduction, NA data proportion is ',sum(is.na(df_reduced))/(700*400)))
## [1] "After data reduction, NA data proportion is 0.758992857142857"
  • before reduction: 1664 films, 943 users, 93.67% data are NA
  • after reduction: 700 films, 400 users, 75.90% data are NA
  • after data reduction, the data sparsity is largely reduced.
df_film_cnt_vor_red <- df_film_user %>% mutate(cnt = rowSums(!is.na(df_film_user)),id=1:1664) %>% select(cnt,id)%>% arrange(desc(cnt))
df_film_cnt_nach_red <- as.data.frame(t(df_reduced)) %>% mutate(cnt = rowSums(!is.na(as.data.frame(t(df_reduced)))),id=1:700) %>% select(cnt,id) %>% arrange(desc(cnt))

df_user_cnt_vor_red <- df_user_film %>% mutate(cnt = rowSums(!is.na(df_user_film)),id=1:943) %>% select(cnt,id)%>% arrange(desc(cnt))
df_user_cnt_nach_red <- df_reduced %>% mutate(cnt = rowSums(!is.na(df_reduced)),id=1:400) %>% select(cnt,id) %>% arrange(desc(cnt))

g1 = ggplot(df_film_cnt_vor_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
  labs(x = "Film", y = "number of ratings", title = "film popularity before reduction") +
  theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())


g2 = ggplot(df_film_cnt_nach_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
  labs(x = "Film", y = "number of ratings", title = "film popularity after reduction") +
  theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())

g3 = ggplot(df_user_cnt_vor_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
  labs(x = "User", y = "number of ratings", title = "User rating frequency before reduction") +
  theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())

g4 = ggplot(df_film_cnt_nach_red, aes(reorder(id,-cnt),cnt))+geom_col(fill="black", col="black")+
  labs(x = "User", y = "number of ratings", title = "User rating frequency after reduction") +
  theme(plot.title = element_text(hjust = 0.5),axis.ticks.x=element_blank(),axis.text.x = element_blank())

grid.arrange(g1,g2,g3,g4,nrow=2,ncol=2)

  • left two plots use the data before reduction. We could see that both plots have long tail.
  • Right side two plots use the data after reduction. The tails in both plots are shorter than left two plots. This means the users and films with too little ratings are eliminated.
par(mfrow=c(1,2))
image(as(df_user_film,"matrix"), main = "sparsity before reduction")
image(as(df_reduced,"matrix"),main = "sparsity after reduction")

  • The two images above showed us the data sparsity before (left image) and after (right image) reduction. The blank pixels represent the NA, the color pixels represent the available values. By comparing the two images, we could see that the right image has less blank and more color pixels than the left one. This means the data after reduction is less sparse than before reduction. Data reduction has successfully reduced the data sparsity.

2.2 mittlere Kundenratings pro Film vor und nach Datenreduktion.

  • Before data reduction
par(mfrow=c(1,2))
df_avg_rating <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g1=ggplot(df_avg_rating,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +                # die Verteilung
  labs(x="average ratings per film", y="Count",title="Before reduction") +
  theme(plot.title = element_text(hjust = 0.5))
df_reduced_t <- as.data.frame(t(df_reduced))
df_reduced_avg_rating <- df_reduced_t%>% mutate(avg_rating = rowMeans(df_reduced_t,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
g2=ggplot(df_reduced_avg_rating,aes(avg_rating)) + geom_histogram(bins = 5,fill="black", col="grey") +                # die Verteilung
  labs(x="average ratings per film", y="Count",title="After reduction") +
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1,g2,nrow=1)

  • Before data reduction, the average ratings per film have a large range between 1 and 5
  • After the data reduction, the range of average ratings is from 1.5 to 4.6, smaller than before reduction.
  • This means, for example some films with only few extreme low ratings or extreme high ratings are excluded.

3 Analyse Ähnlichkeitsmatrix:

  • Erzeuge einen IBCF Recommender und analysiere die Ă„hnlichkeitsmatrix des trainierten Modelles fĂĽr den reduzierten Datensatz.

3.1 Zerlege den reduzierten MovieLense Datensatz in ein disjunkte Trainings- und Testdatenset im Verhältnis 4:1

set.seed(1965)
mx_reduced <- as.matrix(df_reduced)
rrm_reduced <- as(mx_reduced,"realRatingMatrix")
train_test <- evaluationScheme(rrm_reduced, method="split", train=0.8, k=1, given=15, goodRating=4)
# training data 80% of the users
rrm_reduced_train <- getData(train_test,"train")
# test data is 20% of the all users, the test data is splited into two parts: known test data and unknown test data
# the known portion returns specified 50 film ratings per test user which will be used to predict ratings or films for the test users
rrm_reduced_known <- getData(train_test,"known")
# the rest of film ratings of the test users are in unknown portion, which will be used to evaluate the prediction
rrm_reduced_unknown <- getData(train_test,"unknown")
dim(rrm_reduced_train);dim(rrm_reduced_known);dim(rrm_reduced_unknown)
## [1] 320 700
## [1]  80 700
## [1]  80 700
  • After split, there are 320 rows (users) in the trainset, 80 rows in test set (known and unknown).
# understand known and unknown test data
sum(!is.na(t(as.data.frame(as(rrm_reduced_known,'matrix'))[2,]))) # second user, known data
## [1] 15
sum(!is.na(t(as.data.frame(as(rrm_reduced_unknown,'matrix'))[2,]))) # second user, unknown data
## [1] 379
sum(!is.na(t(as.data.frame(as(rrm_reduced_known,'matrix'))[11,]))) # 11th user, known data
## [1] 15
sum(!is.na(t(as.data.frame(as(rrm_reduced_unknown,'matrix'))[11,]))) # 11th user, unknown data
## [1] 223
  • For example, the second user in test set has 15 film ratings in known set, 379 film ratings in unknown set. The intersection of known and unknow set is null.

  • The 11th user in test set has 15 film ratings in known set, 223 film ratings in unknown set.

3.2 Trainiere ein IBCF Modell mit 30 Nachbarn und Cosine Similarity

model_IBCF <- Recommender(data = rrm_reduced_train,method="IBCF",parameter=list(normalize = "Z-score",method="Cosine",k=30))

3.3 Bestimme die Verteilung der Filme,

  • welche bei IBCF fĂĽr paarweise Ă„hnlichkeitsvergleiche verwendet werden.

  • Determine the distribution of films used in IBCF for pairwise similarity comparisons

# Here only exhibit the first 50 rows and columns
get_model_IBCF <- getModel(model_IBCF)
par(mfrow=c(1,2))
g1 = image(get_model_IBCF$sim, main = "Similarity matrix")
g2 = image(get_model_IBCF$sim[1:50, 1:50], main = "Similarity matrix of first 50 rows and cols")
grid.arrange(g1,g2,nrow=1,ncol=2)

  • In the similarity matrix, each row represents a film, each column is also a film. But the matrix is not symmetric. For example when use 30 neighbors, each row should have 30 elements larger than 0. In each column, the number of elements greater than 0 indicates how many times this film was included in the TOP list of other films.
  • In the left similarity matrix, it shows all 700 films similarity. Each row has 30 dark pixels. The columns at right sides of the matrix have more dark pixels than left sides, this means, the films at the right side column are more frequently included in the TOP-N list of other films.
  • The right similarity matrix displays only the first 50 rows and 50 columns of the left matrix.
IBCF_sim <- as.data.frame(colSums(get_model_IBCF$sim > 0))
colnames(IBCF_sim) <- "recommended_frequency" # frequency that the corresponding film is included in other films' TOP-N lists
ggplot(IBCF_sim, aes(x=IBCF_sim$recommended_frequency))+geom_histogram(fill="black", col="grey",binwidth = 5)+
  labs(x = "Recommended frequency", y = "Count", title = "Distribution of recommended frequency per film") +
  theme(plot.title = element_text(hjust = 0.5))

  • The plot shows the distribution of recommended frequency of films. For instance, about 52 films have the recommended frequency of 0, this means they are not included in any TOP recommending lists of other films. About 100 films are included in the TOP lists of 5 films. The highest frequency is about 160, but only a small amount of films have so high recommended frequency. . #### 3.4 Bestimme die Filme,

  • die am häufigsten in der Cosine-Ă„hnlichkeitsmatrix auftauchen und analysiere deren Vorkommen und Ratings im reduzierten Datensatz.

  • die am häufigsten Film in der Cosine-Ă„hnlichkeitsmatrix

# die 10 am häufigsten Filme in der Cosine Ähnlichkeitsmatrix, i.e. die Filme mit der höheste sum Ähnlichkeits
high_freq_film <- IBCF_sim %>% mutate(film = rownames(IBCF_sim)) %>% arrange(desc(recommended_frequency)) %>% slice(0:10)  %>% select(film,recommended_frequency)
rownames(high_freq_film) <- 1:10
t <- df_reduced_t %>% mutate(is_NA = rowSums(is.na(df_reduced_t)),not_NA = rowSums(!is.na(df_reduced_t)),occurrence = rowSums(!is.na(df_reduced_t))/dim(df_reduced_t)[2], film = rownames(df_reduced_t)) %>% select(is_NA,not_NA,occurrence,film)
Occurrence <- left_join(high_freq_film,t,by = "film") %>% select(film,recommended_frequency,occurrence)
t2 <- df_reduced_t %>% mutate(film = rownames(df_reduced_t), avg_rating = rowMeans(df_reduced_t,na.rm=TRUE))%>% select(film,avg_rating)
high_freq_film <- left_join(Occurrence,t2,by = "film")
rownames(high_freq_film) <- 1:10
high_freq_film  # occurrence: not_NA / user number, the user ratio that rated this film
g1 = ggplot(high_freq_film, aes(x= reorder(film,recommended_frequency), y=recommended_frequency)) + geom_bar(stat="identity")+coord_flip() +
  labs(y = "Recommended frequency", x = "Films")

g2 = ggplot(high_freq_film, aes(x= occurrence, y=reorder(film,recommended_frequency))) + geom_bar(stat="identity")+
  labs(x = "occurrence", y = "Films")

g3 = ggplot(high_freq_film, aes(x= avg_rating, y=reorder(film,recommended_frequency))) + geom_bar(stat="identity")+
  labs(x = "average ratings", y = "Films") 
ggarrange(g1, g2, g3,labels = c("A", "B", "C"),ncol = 1, nrow = 3)

  • The three plots present frequency, occurrencem average ratings of the ten most often recommended films, sorted by frequency.

  • recommended_frequency: the frequency that this item appears in the top-n recommendation list of other users.

    • The Mouse Hunt is the most frequently recommended film.
  • occurrence: frequency that the film is rated divided by number of all users

  • avg_rating: the average rating of each film

  • From the result, we could see that, the most often recommended film Mouse Hunt has a relatively low average rating 2.32, and medium occurrence. This means there are other factors influence the recommend results.

4 Implementierung Ähnlichkeitsmatrix:

  • Implementiere eine Funktion zur effizienten Berechnung von sparsen Ă„hnlichkeitsmatrizen fĂĽr IBCF RS und analysiere die Resultate fĂĽr 100 zufällig gewählte Filme.

4.1 Implementiere eine Funktion,

  • um (a) fĂĽr ordinale Ratings effizient die Cosine Similarity und (b) fĂĽr binäre Ratings effizient die Jaccard Similarity zu berechnen #### 4.1a Implementiere eine Funktion,

  • um fĂĽr ordinale Ratings effizient die Cosine Similarity zu berechnen,

  • write a function cos_similarity to calculate the film cosine similarity with ordinal ratings

cos_similarity <- function(mx){
    n <- dim(mx)[2]
    mx_0 <- mx
    mx_0[is.na(mx_0)] <- 0
    sim_mx <- matrix(1:n^2, nrow = n)
    for(i in 1:n){
      for(j in 1:n){
        numerator <- t(mx_0[,i]) %*% mx_0[,j]
        denominator <- sqrt(sum(mx_0[,i]^2))*sqrt(sum(mx_0[,j]^2))
        sim_mx[i,j] <- numerator/denominator
      }
    }
    rownames(sim_mx) <- colnames(mx)
    colnames(sim_mx) <- colnames(mx)
    return(sim_mx)
}

# randomly select 100 films
set.seed(1) 
df_100 <- df_reduced[,sample(1:ncol(df_reduced),100)] 
cos_sim_reduced_1 <- cos_similarity(as(df_100,'matrix'))
as.data.frame(cos_sim_reduced_1) 
rownames(cos_sim_reduced_1)<-NULL
colnames(cos_sim_reduced_1) <- NULL
heatmap(cos_sim_reduced_1,Colv = NA, Rowv = NA,main = "cosine similarity by my function") 

  • the similarity matrix showed the cosine similarity between every two users (of the 100 randomly selected films), darker colors represent higher cosine similarity between two films. Light colors mean less cosine similarity between the two films.

4.1b Implementiere eine Funktion,

  • um fĂĽr binäre Ratings effizient die Jaccard Similarity zu berechnen

  • write a function to calculate the jaccard similarity of binary ratings

Jacc_similarity <- function(mx){
    mx_bi <- mx
    mx_bi[mx_bi <= 3] <- 0
    mx_bi[is.na(mx_bi)] <- 0 # the NA and ratings <= 3 all converted as 0
    mx_bi[mx_bi > 3] <- 1 # the ratings > 3 (which shows a preference) converted as 1
    n <- dim(mx_bi)[2]
    sim_mx <- matrix(1:n^2, nrow = n) # create a matrix with dimention of n x n for similarity
    for(i in 1:n){
      for(j in 1:n){
        list1 = mx_bi[,i]
        list2 = mx_bi[,j]
        uni = 0 # union of film i and j
        ins = 0 # intersection of film i and j
        for(m in 1:dim(mx_bi)[1]){
          if (list1[m] == list2[m] & (list1[m]==0|list1[m]==1)){
            ins = ins + 1
            uni = uni + 1}
          if (list1[m] != list2[m] & !is.na(list1[m]) & !is.na(list2[m])){
            uni = uni + 1}
        }
        sim_mx[i,j] <- ins/uni # intersection/union
      }}
    rownames(sim_mx) <- colnames(mx)
    colnames(sim_mx) <- colnames(mx)
    return(sim_mx)
  }
Jacc_sim_reduced <- Jacc_similarity(as(df_100,'matrix'))  # a 700 x 700 similarity matrix
as.data.frame(Jacc_sim_reduced) 
rownames(Jacc_sim_reduced)<-NULL
colnames(Jacc_sim_reduced) <- NULL
heatmap(Jacc_sim_reduced,Colv = NA, Rowv = NA,main = "Jaccard similarity by my function") 

  • This similarity matrix has darker colors. This means, the jaccard similarity on binary data has higher values than cosine similarity on ordina data.

4.2 Vergleiche deine Implementierung

  • Vergleiche deine Implementierung der Cosine-basierten Ă„hnlichkeitsmatrix fĂĽr ordinale Kundenratings mit der korrespondierenden via Open Source Paketen erzeugten Ă„hnlichkeitsmatrix,
mx_100_0 <- as(df_100,'matrix')
mx_100_0[is.na(mx_100_0)]<-0  # replace NA as 0
# calculate the cosine similarity matrix by open source package lsa
cos_sim_reduced_2 <- cosine(mx_100_0)
as.data.frame(cos_sim_reduced_2) # display a small part of the result
rownames(cos_sim_reduced_2)<-NULL
colnames(cos_sim_reduced_2) <- NULL
heatmap(cos_sim_reduced_2,Colv = NA, Rowv = NA,main = "cosine similarity by library lsa")

# compare the results by my function and the function from lsa library
compare_two_cos_sim_methods <- all.equal(cos_sim_reduced_1, cos_sim_reduced_2, tolerance = 1e-10,check.attributes = FALSE)
compare_two_cos_sim_methods
## [1] TRUE
  • The cosine similarity matrices calculated by two methods are equal (with tolerance of 1e-10). This means my function works correctly.

4.3 Vergleiche und diskutiere die Unterschiede

  • Vergleiche und diskutiere die Unterschiede deiner mittels Cosine Similarity erzeugten Ă„hnlichkeitsmatrizen fĂĽr ordinale und normierte Kundenratings mit der Jaccard-basierten Ă„hnlichkeitsmatrix.
cos_unlist <- data.frame(cos_sim=unlist(as.data.frame(cos_sim_reduced_1)))            # unlist the dataframe
g1 = ggplot(cos_unlist,aes(cos_sim)) + geom_histogram(bins=50) +   # die Verteilung der Kundenratings gesamthaft
  labs(x="cosine similarity", y="Count",title="Distribution of cosine similarities") +
  theme(plot.title = element_text(hjust = 0.5))
jac_unlist <- data.frame(jac_sim=unlist(as.data.frame(Jacc_sim_reduced)))            # unlist the dataframe
g2 = ggplot(jac_unlist,aes(jac_sim)) + geom_histogram(bins=50) +   # die Verteilung der Kundenratings gesamthaft
  labs(x="jaccard similarity", y="Count",title="Distribution of jaccard similarities") +
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1,g2,nrow=1,ncol=2)

compare_cos_Jacc <- all.equal(cos_sim_reduced_1,Jacc_sim_reduced,tolerance = 1e-3,check.attributes = FALSE)
print(paste0("differences between cosine and jaccard similarity is: ",compare_cos_Jacc))
## [1] "differences between cosine and jaccard similarity is: Mean relative difference: 2.173"
print(paste0("mean of cosine similarities is: ", mean(cos_unlist$cos_sim))) 
## [1] "mean of cosine similarities is: 0.252113218211702"
print(paste0("mean of jaccard similarities is: ", mean(jac_unlist$jac_sim))) 
## [1] "mean of jaccard similarities is: 0.7840795"
  • The mean relative difference between cosine similarity and jaccard similarity is 2.173.
  • cosine similarity distribution is right skewed with mode of 0.2 mean of 0.25. Where jaccard similarity distribution is left skewed with mode of 0.83 mean of 0.78.
  • average Jaccard on binary data is much higher than average cosine similarity on ordinal data.
  • The two methods work differently. Jaccard similarity takes the intersection ratings set of two items and then divide by the union. The cosine similarity takes the total length of the vectors.

5 Analyse Top-N Listen IBCF vs UBCF

  • Vergleiche und diskutiere Top N Empfehlungen von IBCF und UBCF Modellen mit 30 Nachbarn und Cosine Similarity fĂĽr den reduzierten Datensatz.

5.1 Berechne Top 15 Empfehlungen fĂĽr Testkunden mit IBCF und UBCF

## top-N recommendations for testdata users with IBCF
Pred_IBCF <- predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList"))
TOP15_IBCF <- sapply(Pred_IBCF@items, function(x) {colnames(df_reduced)[x]})
colnames(TOP15_IBCF) <- rownames(as(rrm_reduced_known,"matrix"))
as.data.frame(TOP15_IBCF)  # each column represents the top 15 recommendations for one user, column names are UserID
# predict with UBCF
model_UBCF <- Recommender(rrm_reduced_train,method="UBCF",param=list(normalize = "Z-score",method="Cosine",nn=30)) #model
# top-N recommendations for testdata users with UBCF
Pred_UBCF <- predict(object = model_UBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList"))
TOP15_UBCF <- sapply(Pred_UBCF@items, function(x) {colnames(df_reduced)[x]})
colnames(TOP15_UBCF) <- rownames(as(rrm_reduced_known,"matrix"))
as.data.frame(TOP15_UBCF) # each column represents the top 15 recommendations for one test user
  • From the result above for first two users, we could see that the top 15 recommendations for the same user between the IBCF and UBCF models are completely different. This means IBCF and UBCF give different recommendations.
  • Using same model, Top-15 recommendations for different users by same model are completely different. This means each user get personalized recommendations.

5.2 Vergleiche die Top 15 Empfehlungen und deren Verteilung

  • Vergleiche die Top 15 Empfehlungen und deren Verteilung und diskutiere Gemeinsamkeiten und Unterschiede zwischen IBCF und UBCF fĂĽr alle Testkunden.

  • IBCF and UBCF models give completely different Top-15 recommendations for the same user.

  • First identify the most recommended movies in the TOP-15 list from all test users.

# generate frequency tables : all the recommendation films with the corresponding frequencies
film_freq_IBCF <- as.data.frame(table(as.factor(TOP15_IBCF)))
colnames(film_freq_IBCF) <- c("Film_by_IBCF", "Frequency")
film_freq_UBCF <- as.data.frame(table(as.factor(TOP15_UBCF)))
colnames(film_freq_UBCF) <- c("Film_by_UBCF", "Frequency")
film_freq_IBCF %>% arrange(desc(Frequency))
film_freq_UBCF %>% arrange(desc(Frequency))
g1 = ggplot(head(film_freq_IBCF %>% arrange(desc(Frequency)),15),aes(x = reorder(Film_by_IBCF,Frequency), y = Frequency)) + geom_col()  + coord_flip() +
  labs(y="Frequency", x="Films",title="IBCF Top-15 films for all users") +
  theme(plot.title = element_text(hjust = 0.5))

g2 = ggplot(head(film_freq_UBCF %>% arrange(desc(Frequency)),15),aes(x = reorder(Film_by_UBCF,Frequency), y = Frequency)) + geom_col() + coord_flip() +
  labs(y="Frequency", x="Films",title="UBCF Top-15 films for all users") +
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1,g2,nrow=2,ncol=1)

  • The two models have again completely different Top frequently recommended films.
  • The dataset used here includes 400 users. The maximum recommended frequency with IBCF and UBCF are 26 and 28 respectively. This means, one film is maximum recommended to 28 users. So we could understand it as, the models try to recommend different films to each user. Users get personalized recommendations.

6 Analyse Top-N Listen Ratings

  • Untersuche den Einfluss von Ratings (ordinale vs binäre Ratings) und Modelltyp (IBCF vs UBCF) auf Top-N Empfehlungen fĂĽr den reduzierten Datensatz. Vergleiche den Anteil ĂĽbereinstimmender Empfehlungen der Top 15 Liste fĂĽr

6.1 IBCF vs UBCF, beide mit ordinalem Rating und Cosine Similarity fĂĽr alle Testkunden

# the test user "unknown" ratings
mx_reduced_unknown <- as(rrm_reduced_unknown,"matrix")
# evaluate recommendations on "unknown" ratings
acc_IB <- calcPredictionAccuracy(predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating=4)
acc_UB <- calcPredictionAccuracy(predict(object = model_UBCF, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating=4)
acc_ordinal <- rbind(acc_IB,acc_UB)
rownames(acc_ordinal) <- c("IBCF ordinal","UBCF ordinal")
acc_ordinal <- as.data.frame(acc_ordinal) %>% mutate(f1_score = 2*precision*recall/(precision + recall))
as(acc_ordinal,'matrix')
##                  TP      FP      FN       TN   N precision     recall
## IBCF ordinal 6.1625  8.8375 76.4000 593.6000 685 0.4108333 0.08378385
## UBCF ordinal 1.8750 13.1250 80.6875 589.3125 685 0.1250000 0.02345749
##                     TPR        FPR   f1_score
## IBCF ordinal 0.08378385 0.01446599 0.13918319
## UBCF ordinal 0.02345749 0.02181942 0.03950203
  • The metrics precision reflects the fraction of top-N recommended items that are relevant to the user.
    • precision = TP/(TP+FP)
  • recall is a fraction of top k recommended items that are in a set of items relevant to the user.
    • recall = TP/(TP+FN)
  • f1_score is a harmonic accuracy calculated from precision and recall.
    • f1 = 2 * (Precision * Recall) / (Precision + Recall)
  • With ordinal ratings and cosine similarity, IBCF has better f1-score than UBCF.

6.2 IBCF vs UBCF, beide mit binärem Rating und Jaccard Similarity für alle Testkunden

# convert the reduced dataset to binary: ratings > 3 converted as 1 (like), ratings <= 3 converted as 0 (not like)
df_reduced_bi <- df_reduced
df_reduced_bi[df_reduced_bi <= 2] <- 0
df_reduced_bi[df_reduced_bi > 2] <- 1
set.seed(468)
mx_reduced_bi <- as.matrix(df_reduced_bi)
rrm_reduced_bi <- as(mx_reduced_bi,"realRatingMatrix")
train_test_bi <- evaluationScheme(rrm_reduced_bi, method="split", train=0.8, k=1, given=15)
# training data 80% of the users
rrm_reduced_train_bi <- getData(train_test_bi,"train")
# test data is 20% of the all users, the test data is splited into two parts: known test data and unknown test data
# the known portion returns specified 20 items per test user is used to predict ratings or films for the test users
rrm_reduced_known_bi <- getData(train_test_bi,"known")
# the unknown portion is used to compute the prediction error of the model
rrm_reduced_unknown_bi <- getData(train_test_bi,"unknown")
# train the IBCF or UBCF model on training dataset
model_IBCF_bi <- Recommender(data = rrm_reduced_train_bi,method="IBCF",parameter=list(normalize = "Z-score",method="Jaccard",k=30))
model_UBCF_bi <- Recommender(data = rrm_reduced_train_bi,method="UBCF",parameter=list(normalize = "Z-score",method="Jaccard",nn=30))
# predict the ratings of test users by IBCF and UBCF
pred_rating_IBCF_bi <- as(predict(object = model_IBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),"matrix")
pred_rating_UBCF_bi <- as(predict(object = model_UBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),"matrix")
# the test user "unknown" ratings
mx_reduced_unknown_bi <- as(rrm_reduced_unknown_bi,"matrix")
# evaluate recommendations on "unknown" ratings
acc_IB_bi <- calcPredictionAccuracy(predict(object = model_IBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="topNList"),rrm_reduced_unknown_bi,given=15,goodRating=1)
acc_UB_bi <- calcPredictionAccuracy(predict(object = model_UBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="topNList"),rrm_reduced_unknown_bi,given=15,goodRating=1)
acc_bi <- rbind(acc_IB_bi,acc_UB_bi)
rownames(acc_bi) <- c("IBCF binary","UBCF binary")
acc_bi <- as.data.frame(acc_bi) %>% mutate(f1_score = 2*precision*recall/(precision + recall))%>% select(-c(TP,FP,FN,TN,N,TPR,FPR))
as(acc_bi,'matrix')
##             precision     recall   f1_score
## IBCF binary 0.4031746 0.01270659 0.02463672
## UBCF binary 0.1916667 0.02441571 0.04331382
  • UBCF with binary ratings and jaccard similarity has better f1-score.

6.3 UBCF mit ordinalem (Cosine Similarity) vs UBCF mit binärem Rating (Jaccard Similarity) für alle Testkunden.

acc_ubcf <- as(rbind(acc_UB,acc_UB_bi),'matrix')
acc_ubcf <- as.data.frame(acc_ubcf) %>% mutate(f1_score = 2*precision*recall/(precision + recall))%>% select(-c(TP,FP,FN,TN,N,TPR,FPR))
rownames(acc_ubcf) <- c("UBCF ordinal cosine","UBCF binary jaccard")
as(acc_ubcf,'matrix')
##                     precision     recall   f1_score
## UBCF ordinal cosine 0.1250000 0.02345749 0.03950203
## UBCF binary jaccard 0.1916667 0.02441571 0.04331382
  • UBCF with binary data and jaccard similarity has higher f1-score.

7 Analyse Top-N Listen -IBCF vs SVD

  • Aufgabe: Vergleiche Memory-based IBCF und Modell-based SVD Recommenders bezĂĽglich Ăśberschneidung ihrer Top-N Empfehlungen fĂĽr die User-Item Matrix des reduzierten Datensatzes (Basis: IBCF mit 30 Nachbarn und Cosine Similarity). Vergleiche wie sich der Anteil ĂĽbereinstimmender Empfehlungen der Top-15 Liste fĂĽr IBCF vs verschiedene SVD Modelle verändert, wenn die Anzahl der Singulärwerte fĂĽr SVD von 10 auf 20, 30, 40, 50 verändert wird.
# SVD MODEL
model_SVD_10 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=10))
model_SVD_20 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=20))
model_SVD_30 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=30))
model_SVD_40 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=40))
model_SVD_50 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=50))
# evaluate recommendations on "unknown" ratings
acc_SVD_10 <- calcPredictionAccuracy(predict(object = model_SVD_10, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_20 <- calcPredictionAccuracy(predict(object = model_SVD_20, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_30 <- calcPredictionAccuracy(predict(object = model_SVD_30, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_40 <- calcPredictionAccuracy(predict(object = model_SVD_40, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_50 <- calcPredictionAccuracy(predict(object = model_SVD_50, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_IBCF_top15 <- calcPredictionAccuracy(predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=15,goodRating = 4)
acc_SVD_IB <- rbind(acc_SVD_10,acc_SVD_20,acc_SVD_30,acc_SVD_40,acc_SVD_50,acc_IBCF_top15)
rownames(acc_SVD_IB) <- c("SVD_k_10","SVD_k_20","SVD_k_30","SVD_k_40","SVD_k_50","IBCF_cos_k_30")
#acc_SVD_IB <- as.data.frame(acc_SVD_IB) %>% mutate(.,models = c("SVD_10","SVD_20","SVD_30","SVD_40","SVD_50","IBCF_cos_30"))
acc_SVD_IB <- as.data.frame(acc_SVD_IB) %>% mutate(f1_score = 2*precision*recall/(precision + recall))%>% select(-c(TP,FP,FN,TN,N,TPR,FPR))
acc_SVD_IB$model <- c("SVD_10","SVD_20","SVD_30","SVD_40","SVD_50","IBCF_cos_30")
acc_SVD_IB
metrics_long <- acc_SVD_IB  %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_long, aes(model, Value, fill = metrics)) + 
  geom_col(position = "dodge")+
  labs(title="Compare SVD and IBCF models") + 
  theme(plot.title = element_text(hjust = 0.5))

  • All five SVD models with different singular values have very similiar metrics values.
  • The IBCF model with cosine similarity and 30 neighbors has slightly higher f1-score than all five SVD models.

8 Implementierung Top-N Metriken

  • Implementiere Funktionen, um aus Top-N Listen fĂĽr alle Kunden die Item-space und eines Recommenders zu beurteilen.
  • Teste diese System-Metriken mit IBCF Recommendations fĂĽr Top-N Listen der Länge N = 5, 10, 15, 20, 25, 30 (Basis: reduzierter Datensatz).
calc_topn_metrics <- function(mx,split_meth,split_ratio,split_k,nr,meth,para){  
  # mx: U_I data; split_meth: 'cross' or 'split'; split_ratio: 0.8 for example; split_k: an integer, 1 for split, other numbers for cross-validation ; nr: Top-N; meth: model method,e.g. "IBCF"; para:parameter for models
  rrm <- as(mx,"realRatingMatrix")
  # split train, test-known, test_unknown data
  set.seed(1965)
  train_test <- evaluationScheme(rrm, method=split_meth, train=split_ratio, k=split_k, given=15,goodRating=4)
  rrm_train <- getData(train_test,"train")
  rrm_known <- getData(train_test,"known")
  rrm_unknown <- getData(train_test,"unknown")
  # model
  model <-Recommender(data = rrm_train,method=meth,parameter=para)
  # predict Top-N recommendation list on "known" test set
  pred_test_user<- predict(object = model, newdata = rrm_known, n = nr,type="topNList")
  TOP_N_list <- sapply(pred_test_user@items, function(x) {colnames(as(rrm_known,"matrix"))[x]}) # top-n list for all test users
  ####################################
  ### accuracy: evaluate the recommendations on "unknown" test set
  acc_ <- calcPredictionAccuracy(pred_test_user,rrm_unknown,given=15,goodRating = 4)
  acc_ <- t(as.data.frame(acc_))
  rownames(acc_) <- NULL
  
  ####################################
  ### item-space coverage: refers to the proportion of items that the recommendation system can recommend. The simplest measure is the percentage of all films that in the top-n recommendation lists divided by all potential films.
  # predict top-n lists of all user
  TOP_N_list <- sapply(pred_test_user@items, function(x) {colnames(as(rrm_known,"matrix"))[x]})
  # unique predicted film list of all test users
  uniq_film_test <- reshape2::melt(as(TOP_N_list,"matrix")) %>% rename(UserID = Var2, rank = Var1, Film_name = value)%>%distinct(Film_name)   # unique film list recommended in test data
  # unique film list of data
  uniq_film <- as.data.frame(t(mx))
  uniq_film$cnt <- rowSums(!is.na(uniq_film)) # count not NA for each film
  uniq_film <- uniq_film %>% filter(cnt>0)  # remove the film without any ratings
  # calculate the item-space coverage
  coverage <- dim(uniq_film_test)[1] / dim(uniq_film)[1] # the coverage
  coverage_table <- as.data.frame(coverage)
  colnames(coverage_table) <- "coverage"
  
  ######################################
  ### novelty for a given user: recommended movies that the user did not know about. 
  novelty_table <- data.frame() # an empty dataframe, will be filled with novelty values
  
  # extract the full test set (with known and unknown ratings)
  a <- (as.data.frame(as(rrm_known,'matrix')))
  df_test <- as.data.frame(mx) %>% mutate(as.data.frame(mx),idx=rownames(as.data.frame(mx))) 
  a <- a %>% mutate(a, idx =  rownames(a)) #%>% select(index)
  df_test <- semi_join(df_test,a,by=c("idx"="idx")) %>% select(.,-c(idx)) # full test set
  
  # df_i: replace the not NA values to the corresponding column name,
  # df_i contains all films that user i has rated, or say the user i has known. 
  for(i in 1:dim(df_test)[1]){
    df_i <- as.data.frame(t(df_test))#
    df_i$Film <- colnames(df_test) # set index (film names) as a new column 'Film'
    df_i <- df_i[,c(i,(dim(df_test)[1]+1))] %>% filter(complete.cases(.)) # list of user-i rated films
    df_i$Film <- rownames(df_i) # add a new column with the same content of rownames
    df_top_n <- as.data.frame(TOP_N_list[,i])   # top-n list of user-i
    colnames(df_top_n) <- "Film"
    df_cross <- inner_join(df_i, df_top_n, by="Film")  # inner join the two dataset, we get the recommended films that user already known.
    # calculate novelty
    novelty <- 1 - dim(df_cross)[1]/nr  # novelty value of user-i
    novelty_table <- rbind(novelty_table, novelty)
  }
  
  novelty_table$UserID <- rownames(df_test)
  colnames(novelty_table) <- c("novelty","UserID")
  novelty_table <- novelty_table %>% select(UserID,novelty) # novelty table
  
  metrics<- as.data.frame(acc_) %>% mutate(coverage = mean(coverage_table$coverage),novelty = mean(novelty_table$novelty), f1_score = 2*precision*recall/(precision + recall),Top_N = nr) %>% select(-c(N,TN,FN,FP,TP,TPR,FPR))
  return(metrics)
}
N_lst = c(5, 10, 15, 20, 25, 30)
metrics <- data.frame()
for(nr in N_lst){metrics <- bind_rows(metrics,calc_topn_metrics(mx_reduced,'split',0.8,1,nr,"IBCF",list(normalize = "Z-score",method="Cosine",k=30)))}
metrics$models <- 'IBCF'
metrics
metrics_8_long <- metrics %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_8_long, aes(x=Top_N, y = Value,  group = metrics, colour=metrics)) + 
  geom_line() + 
  geom_point()+
  labs(title="IBCF metrics with different N values") + 
  theme(plot.title = element_text(hjust = 0.5))

  • Using IBCF with 30 neighbours and reduced data, I calculated the item-space coverage and novelty of Top-5, 10, 15, 20, 25, 30 lists of all users.
  • The item-space is calculated by the number of all recommended films (for all users) divided by all the potential films in the dataset. Coverage reflects the ability of recommender system to recommend all items from a train set to users, or in another word.
    • If coverage is very low, it means the recommender system is recommending users only a small group of items (for example, possibly only the top popular items are recommended).
    • Low coverage can lead to users’ dissatisfaction.
    • Here we could see that when increasing N value, the coverage increased, which totally makes sense. Because the longer the recommending list is, the system could recommend more films to each user, therefore the chance is higher that more films appear in the Top-N list.
  • The explains how many percentage unknown films are recommened to users, in another word, it measures surprise. But here I could only consider that for a given user, the rated films are known, not-rated films are unknown. But I could not find out if the user has known the not-rated films before.
    • It is calculated by the recommended unknown films of a user divided by N.
    • When increase the length of Top-N list, the novelty also increased, but it increased slower than coverage.
  • Additionally when using longer Top-N list, the precision decreased, recall increased, the f1-score also increased.

Why use different metrics to evaluate recommender system?

  • Accuracy tells if a recommender system is able to predict items that an user has already rated. But recommender system is not all about accuracy, metrics like coverage, novelty, diversity are also very important.
  • Recommender system should keep the balance of the items that user might like and items that user do not know but could discover in the recommendation list. Because familiar items will let customers trust the recommender system, unfamiliar items will let them discover new things.

9 Wahl des optimalen Recommenders

  • Bestimme aus 5 unterschiedlichen Modellen das fĂĽr Top-N Empfehlungen “beste” Modell und verwende zusätzlich einen Top- Movie Recommender (Basis: reduzierter Datensatz).

9.1 Verwende fĂĽr die Evaluierung 10-fache Kreuzvalidierung

model_lst = c('IBCF','UBCF','SVD','RANDOM','LIBMF')
para_lst = c(list(normalize = "Z-score",method="Cosine",k=30),list(normalize = "Z-score",method="cosine",nn=30),list(normalize = "Z-score",k=30),NULL,NULL)
metrics_91 <- data.frame()
for(i in 1:5){metrics_91 <- bind_rows(metrics_91,calc_topn_metrics(mx_reduced,'cross',0.8,10,15,model_lst[i],para_lst[i]))}
## Warning: Unknown parameter: method
## Available parameter (with default values):
## dim   =  10
## costp_l2  =  0.01
## costq_l2  =  0.01
## nthread   =  1
## verbose   =  FALSE
metrics_91$models <- c('IBCF','UBCF','SVD','RANDOM','LIBMF')
rownames(metrics_91) <- metrics_91$models
metrics_91
metrics_long <- metrics_91 %>% pivot_longer(names_to = "metrics", values_to = "Value", coverage:f1_score)
ggplot(metrics_long, aes(models, Value, fill = metrics)) + 
  geom_col(position = "dodge")+
  labs(title="Compare 5 models with 10-fold cross-validation") + 
  theme(plot.title = element_text(hjust = 0.5))

9.2 BegrĂĽnde meine Wahl der Performance Metrik,

  • f1-score is a harmonic accuracy calculated by precision and recall. Higher precision means that an algorithm returns more relevant results than irrelevant ones. high recall means that an algorithm returns most of the relevant results (whether or not irrelevant ones are also returned). A perfect precision score of 1.0 means that every result retrieved was relevant (but says nothing about whether all relevant documents were retrieved) whereas a perfect recall score of 1.0 means that all relevant documents were retrieved by the search (but says nothing about how many irrelevant documents were also retrieved)
    • IBCF, SVD and LIBMF have much better f1-score than UBCF and RANDOM.
  • coverage is metric to present how many percentage of items are recommended. The users will trust the system when they get recommendations which they already know.
    • RANDOM, IBCF, and UBCF have better coverage than other two.
  • novelty is metric to present how many percentage of recommended items are unknown to the user. It brings surprise to users, and potential to sell products to more new users.
  • A good balance between coverage and novelty is very important, so that users will trust the system and explore new products.
    • All five models have relative high novelty, in the range 0.68 and 0.86. UBCF and RANDOM are a little bit better than the other three models.
  • Together, IBCF has a good balance in the three metrics coverage, novelty and f1-score. Therefore, I will select IBCF for further parameter tuning.

9.3 Analysiere das beste Modell fĂĽr Top-N Recommendations mit N gleich 10, 15, 20, 25 und 30,

N_lst = c(10,15,20,25,30)
metrics_93 <- data.frame()
for(i in 1:5){metrics_93 <- bind_rows(metrics_93,calc_topn_metrics(mx_reduced,'cross',0.8,10,N_lst[i],'IBCF',list(normalize = "Z-score",method="Cosine",k=30)))}
metrics_93$models <- 'IBCF'
metrics_93
metrics_93_long <- metrics_93 %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_93_long, aes(x=Top_N, y = Value,  group = metrics, colour=metrics)) + 
  geom_line() + 
  geom_point()+
  labs(title="Best model (IBCF) with different N values") + 
  theme(plot.title = element_text(hjust = 0.5))

  • When increase the length of recommendation list, the coverage increases from 0.22 to 0.47,

  • all other metrics have smaller increase or decrease (precision).

9.4 Optimiere mein bestes Modell hinsichtlich Hyperparameter.

  • Hinweis: Verwende fĂĽr den Top-Movie Recommender die Filme mit den höchsten Durchschnittsratings.
# films with the highest average ratings ( avg_ratings > 3)
df_top_avg <- as.data.frame(t(df_reduced))
df_top_avg <- df_top_avg %>% mutate(avg_rating = rowMeans(df_top_avg,na.rm=TRUE,dims=1))%>% arrange(desc(avg_rating))%>% filter(avg_rating>3) %>% select(-avg_rating)
mx_top_avg <- t(df_top_avg)
dim(mx_top_avg)
## [1] 400 566
  • Filter the films with average ratings > 3.

  • The new dataframe has 566 films.

  • Now I would like to tune hyperparameters of model IBCF:

    • similarity methods
    • number of neighbours
    • normalization methods
# hyperparameter tuning
k_neighbor_lst <- c(15,20,30,50,100)
normalize_lst <- c('center','Z-score')
similarity_lst <- c('cosine','Pearson')
metrics_94 <- data.frame()
for(i in 1:length(normalize_lst)){
  for(j in 1:length(similarity_lst)){
    for(m in 1:length(k_neighbor_lst)){
        metrics_new <- calc_topn_metrics(mx_top_avg,'cross',0.8,10,20,'IBCF',list(normalize = normalize_lst[i],method=similarity_lst[j],k=k_neighbor_lst[m]))
        metrics_new$normalize <- normalize_lst[i]
        metrics_new$similarity<- similarity_lst[j]
        metrics_new$k_neighbor <- k_neighbor_lst[m]
        metrics_94 <-bind_rows(metrics_94,metrics_new)}}}
metrics_94 <- metrics_94 %>% select(-Top_N)
metrics_94
metrics_94_long <- metrics_94 %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_94_long, aes(x=k_neighbor, y = Value,  group = metrics, colour=metrics)) + 
  geom_line() + 
  geom_point() +
  facet_grid(cols=vars(normalize),rows=vars(similarity)) + 
  labs(title="Hyperparameter tuning of best(IBCF) model") + 
  theme(plot.title = element_text(hjust = 0.5))

  • Here I tried to optimize the best (IBCF) model through three parameters normalization, similarity, k neighbours.

  • The two normaliazation methods center and z-score have very similiar performance on the metrics.

  • With Pearson similarity, the coverage is higher. When k = 15 or k = 20, the novelty values are also larger; With cosine similarity, the precision values are larger when k < 100.

  • k_neighbour: coverage and novelty values are largest with k = 100. Precision, recall, f1-value have largest values at around k=30 or k = 20.

  • Finally for IBCF model I choose the parameters “k = 30, center normalization, cosine similarity”, as with this combination IBCF model has a good balance of metrics coverage, novelty, and f1-score.

best_IBCF <- metrics_94 %>% filter(.,normalize=="center"& similarity == "cosine" & k_neighbor == 30)
best_IBCF

10 Implementierung Top-N Monitor Aufgabe DIY:

  • Untersuche die relative Ăśbereinstimmung zwischen Top-N Empfehlungen und präferierten Filmen fĂĽr 4 unterschiedliche Modelle (z.B. IBCF und UBCF mit unterschiedlichen Ă„hnlichkeits- metriken / Nachbarschaften sowie SVD mit unterschiedlicher Dimensionalitätsreduktion).

10.1 Fixiere 20 zufällig gewählte Testkunden für alle Modellvergleiche,

  • while we have 400 users in the dataset with top rated films, I will split train-test set with ratio of 0.95, so that the test set will have exactly 20 users.
  • I will compare IBCF, UBCF, SVD, SVDF (based on Funk SVD with gradient descend) models with different parameters
# use split ratio = 0.95, will get exactly 20 users in test set
neighbor_lst <- c(15,20,30,50,100) # k neighbours parameter
normalize_lst <- c('center','Z-score') # normalization methods
similarity_lst <- c('cosine','Pearson') # similarity methods for ordinal data
k_dimred <- c(10,20,30,40,50)  #dimension reduction parameter for model SVD
# IBCF UBCF and SVD
metrics_IBCF <- data.frame()
metrics_UBCF <- data.frame()
metrics_SVD <- data.frame()
metrics_SVDF <- data.frame()
for(i in 1:length(normalize_lst)){
  for(m in 1:length(neighbor_lst)){ 
    metrics_new_SVD <- calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'SVD',list(normalize = normalize_lst[i],k=k_dimred[m]))
    metrics_new_SVD$normalize <- normalize_lst[i]
    metrics_new_SVD$k_dim <- k_dimred[m]
    metrics_new_SVD$model <- 'SVD'
    metrics_SVD <-bind_rows(metrics_SVD,metrics_new_SVD)
    
    metrics_new_SVDF <- calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'SVDF',list(normalize = normalize_lst[i],k=k_dimred[m]))
    metrics_new_SVDF$normalize <- normalize_lst[i]
    metrics_new_SVDF$k_dim <- k_dimred[m]
    metrics_new_SVDF$model <- 'SVDF'
    metrics_SVDF <-bind_rows(metrics_SVDF,metrics_new_SVDF)
    
    for(j in 1:length(similarity_lst)){
        metrics_new_IBCF<-calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'IBCF',list(normalize = normalize_lst[i],method=similarity_lst[j],k=neighbor_lst[m]))
        metrics_new_IBCF$normalize <- normalize_lst[i]
        metrics_new_IBCF$similarity<- similarity_lst[j]
        metrics_new_IBCF$k_neighbor <- k_neighbor_lst[m]
        metrics_new_IBCF$model <- 'IBCF'
        metrics_IBCF <-bind_rows(metrics_IBCF,metrics_new_IBCF)
        
        metrics_new_UBCF <-calc_topn_metrics(mx_top_avg,'split',0.95,1,20,'UBCF',list(normalize = normalize_lst[i],method=similarity_lst[j],nn=neighbor_lst[m]))
        metrics_new_UBCF$normalize <- normalize_lst[i]
        metrics_new_UBCF$similarity<- similarity_lst[j]
        metrics_new_UBCF$nn_neighbor <- neighbor_lst[m]
        metrics_new_UBCF$model <- 'UBCF'
        metrics_UBCF <-bind_rows(metrics_UBCF,metrics_new_UBCF)
        }}}
metrics_IBCF <- metrics_IBCF %>% select(-Top_N)
metrics_UBCF <- metrics_UBCF %>% select(-Top_N)
metrics_SVD <- metrics_SVD %>% select(-Top_N)
metrics_SVDF <- metrics_SVDF %>% select(-Top_N)
metrics_IBCF
metrics_UBCF
metrics_SVD
metrics_SVDF
metrics_IBCF_long <- metrics_IBCF %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_IBCF_long, aes(x=k_neighbor, y = Value,  group = metrics, colour=metrics)) + 
  geom_line() + 
  geom_point() +
  facet_grid(cols=vars(normalize),rows=vars(similarity)) + 
  labs(title="Hyperparameter tuning of IBCF model (20 test users)", x="k neighbors") + 
  theme(plot.title = element_text(hjust = 0.5))

  • IBCF models hyperparameter tuning result analysis
    • novelty is always larger than 0.5 with any parameter combinations
    • With pearson similarity, f1_score and coverage values vary oppositely when increasing k.
    • With cosine similarity, f1_score and novelty vary oppositely when increasing k.
    • With center-nomalization, cosine similarity, and k = 20, the IBCF has a good balance of all metrics.
metrics_UBCF_long <- metrics_UBCF %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(metrics_UBCF_long, aes(x=nn_neighbor, y = Value,  group = metrics, colour=metrics)) + 
  geom_line() + 
  geom_point() +
  facet_grid(cols=vars(normalize),rows=vars(similarity)) + 
  labs(title="Hyperparameter tuning of UBCF model (20 test users)",x="nn neighbors") + 
  theme(plot.title = element_text(hjust = 0.5))

  • UBCF models hyperparameter tuning result analysis
    • novelty is always larger than 0.6 with any parameter combinations
    • f1_score and novelty vary oppositely when increasing k.
    • With nn = 50, all metrics reach a good balance. With Z-score and cosine, the coverage and f1_score are a little bit better than the other three.
    • Finally, with Z-score-nomalization, cosine similarity, and nn = 50, the UBCF has a good balance of all metrics.
metrics_SVD_long <- metrics_SVD %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
g1 = ggplot(metrics_SVD_long, aes(x=k_dim, y = Value,  group = metrics, colour=metrics)) + 
  geom_line() + 
  geom_point() +
  facet_grid(cols=vars(normalize)) + 
  labs(title="Hyperparameter tuning of SVD model (20 test users)",x="k dimension reduction") + 
  theme(plot.title = element_text(hjust = 0.5))

metrics_SVDF_long <- metrics_SVDF %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
g2 = ggplot(metrics_SVDF_long, aes(x=k_dim, y = Value,  group = metrics, colour=metrics)) + 
  geom_line() + 
  geom_point() +
  facet_grid(cols=vars(normalize)) + 
  labs(title="Hyperparameter tuning of SVDF model (20 test users)",x="k dimension reduction") + 
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(g1, g2, nrow = 2)

  • SVD models hyperparameter tuning result analysis
    • The metric values don’t vary much with different parameter combinations.
    • SVD models have very small coverage compare to IBCF and UBCF models. This means the SVD models are recommending much smaller part of films to users. The recommendations may be less personalized.
    • I will simply choose the combination with highest coverage, which is with center normalization and k = 50.
  • SVDF models hyperparameter tuning result analysis
    • novelty values are overall larger than 0.49.
    • precision values are larger than 0.34.
    • coverage values are smaller than 0.24, and decrease when increasing k values. The coverage values are also smaller than IBCF and UBCF, but larger than SVD.
    • With the combination of center normalization and k = 10, the SVDF has the best balance of all metrics.
  • merge the four best models in one table
best_IBCF <- metrics_IBCF %>% filter(.,normalize=="center"& similarity == "cosine" & k_neighbor == 20)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_UBCF <- metrics_UBCF %>% filter(.,normalize=="Z-score"& similarity == "cosine" & nn_neighbor == 50)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_SVD <- metrics_SVD %>% filter(.,normalize=="center"& k_dim == 50)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_SVDF <- metrics_SVDF %>% filter(.,normalize=="center"& k_dim == 10)%>%select(c(precision,recall,coverage,novelty,f1_score,model))
best_models <- rbind(best_IBCF,best_UBCF,best_SVD,best_SVDF)
best_models
best_models_long <- best_models %>% pivot_longer(names_to = "metrics", values_to = "Value", precision:f1_score)
ggplot(best_models_long, aes(model, Value, fill = metrics)) + 
  geom_col(position = "dodge")+
  labs(title="Compare 4 best models (20 test users)") + 
  theme(plot.title = element_text(hjust = 0.5))

  • Over four best models, the IBCF has the best balance of all metrics.
  • So I will select IBCF model with center-nomalization, cosine similarity, and k = 20 for my RS.

10.2 Bestimme den Anteil der Top-N Empfehlung nach Genres pro Kunde,

  • the function Top_n_genre will return a table, where each row represents one user, each column represents one genre, the values are the genre percentage of top-15 recommendations per test user (each row should have the sum of 1).
Top_n_genre <- function(Top_n_list,n){   # Top_n_list: top-n matrix; n: the "n" in top-n
  table = data.frame()
  for(i in 1:dim(Top_n_list)[2]){
    df_top <- as.data.frame(Top_n_list[,i])
    colnames(df_top) <- "Film"
    df_film_genre_1 <- as.data.frame(mx_film_genre)
    df_film_genre_1$Film <- rownames(df_film_genre_1)
    df_top <- left_join(df_top,df_film_genre_1,by=("Film")) %>% select(-Film)
    df_top <- df_top[-1,]
    total <- sum(df_top)
    df_top["ratio",] <- colSums(df_top)/total
    df_top <- df_top["ratio",]
    table <- rbind(table,df_top)
  }
  rownames(table) <- colnames(Top_n_list)
  return(table)
}
# use the same split setting as in the last task to make sure that it is always the same train set, same 20 test users. 
set.seed(1965)
train_test_10 <- evaluationScheme(as(mx_top_avg,"realRatingMatrix"), method="split", train=0.95, k=1, given=15,goodRating=4) 

# training dataset has 380 users,test dataset has 20 users 
# given=15: For each test user, 15 films per user will be used for prediction, the rest for evaluation)
rrm_reduced_train_10 <- getData(train_test_10,"train")
rrm_reduced_known_10 <- getData(train_test_10,"known") 
rrm_reduced_unknown_10 <- getData(train_test_10,"unknown")

# Use the best model (selected in the last task): ICBF models with center normalization, cosine similarity, and 20 neighbors
model_IBCF <-Recommender(data = rrm_reduced_train_10,method="IBCF",parameter=list(normalize = "center",method="cosine",k=20))

# Top-15 recommendation list of 
Pred_ <- predict(object = model_IBCF, newdata = rrm_reduced_known_10, n = 15,type=c("topNList"))
Top15_ <- sapply(Pred_@items, function(x) {colnames(mx_top_avg)[x]})
Top_15_recommendations <- Top_n_genre(Top15_,15) # the percentage of top-15 recommendations by genre per test user
rownames(Top_15_recommendations) <- rownames(as.data.frame(as(rrm_reduced_known_10,"matrix")))
Top_15_recommendations 
# check if the table works correct. Each row should have the sum of 1
rowSums(Top_15_recommendations) 
## 655 303 234   7 268 194 374 868 840 807 189 488 608 763 197 253 466 523 871 159 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
ggplot(stack(Top_15_recommendations), aes(x = reorder(ind,-values), y = values)) +
  geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(x=NULL,y="Proportion",title = "Genre proportion in Top-15 recommendations per user")

  • In this part, the goal is to analyse the genre proportion of the predicted films per user.

  • In the table, each row represents the genre proportion in the top-15 recommendation list of one user. This means the sum of each row should be 1.

  • In the short calculation under the table, we could see that all rowsum are equal to 1. This demonstrates that proportion table is correctly calculated.

  • The boxplot showed us the proportion of all 18 film genres. Each box includes the data of 20 test users.

  • We could see that Drama has the highest proportion in top-15 recommendations. And Drama genre is the only genre which is recommended to all 20 users (there is no zero proportion), the smallest proportion is about 0.07, the highest proportion is about 0.44.

  • Animation, Children’s, Documentary, Fantasy and Musical have the least proportion. And they appeared only in very few users’ top-15 recommendations.

10.3 Bestimme pro Kunde den Anteil nach Genres seiner Top-Filme

  • (=Filme, welche vom Kunden die besten Bewertungen erhalten haben)

  • Now let’s have a look, how the genres proportion in the true data per user.

# firstly, select the twenty users which are used in the last two tasks, with the top average-ratings films.
mx_users_20 <- mx_top_avg[c(rownames(Top_15_recommendations)),] 
as.data.frame(mx_users_20)
  • the function Top_film_genre will use the true rating data of the users as input.
  • return the genre proportions of the top-n rated films (true favourite films) per user.
Top_film_genre <- function(mx,n){   # mx is user-item matrix, n: top-n
  table = data.frame()
  for(i in 1:dim(mx)[1]){ # for each user
    df_top <- as.data.frame(t(mx))
    df_top$Film <- rownames(df_top)
    df_top <- df_top[,c(i,(dim(mx)[1]+1))]
    colnames(df_top) <- c("Ratings","Film")
    # filter only the films with ratings -> sort by ratings -> select the top-n rated films
    df_top <- df_top %>% filter(complete.cases(.)) %>% arrange(desc(Ratings)) %>% slice(1:n)
    # use the data film_genre, left join to the top-n rated films
    # then will get 18 genre columns, with the value of 1 or 0 (because each film could have many genres)
    df_film_genre_1 <- as.data.frame(mx_film_genre)
    df_film_genre_1$Film <- rownames(df_film_genre_1)
    df_left_join <- left_join(df_top,df_film_genre_1,by=("Film"))
    df_top <- df_left_join[,-(1:2)]
    total <- sum(df_top) # total genres involved in the top-n rated films
    # generate a new row called "ratio", with the proportion of each genre
    df_top["ratio",] <- colSums(df_top)/total
    df_top <- df_top["ratio",] # keep only the ratio row
    table <- rbind(table,df_top) # bind this row to the output table
  }
  rownames(table) <- rownames(mx)
  return(table)
}
Top_15_films <- Top_film_genre(mx_users_20,15) # genres proportion of the top 15 films for every user
Top_15_films
rowSums(Top_15_films)
## 655 303 234   7 268 194 374 868 840 807 189 488 608 763 197 253 466 523 871 159 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
ggplot(stack(Top_15_films), aes(x = reorder(ind,-values), y = values)) +
  geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(x=NULL,y="Proportion",title = "Genre proportion in Top-15 favourite films per user")

  • The first table shows the genre proportion in the top-15 rated films per user.

    • The top-15 rated films per user were analysed and calculted the corresponding genre proportion. The sum of each user should be
  • The second brief calculation is the sum of each row. The result shows that all are 1, which indicates that the proportion table is correctly calculated.

  • The box-plot shows proportion of all the 18 genres. Each box includes the proportion of 20 test users.

    • Drama has the highest medium, with the values between 0.08 and 0.47.
    • The genres Drama, Action, Romance, Thriller have no zero values. This means, all the twenty users have high ratings to the films with four genres.
    • Animation, Children’s, Documentary, Fantasy, Musical, Western have the least medium of 0. They are not so popular in the selected 20 users.

10.4 Vergleiche pro Kunde Top-Empfehlungen und Top-Filmen nach Genres,

  • Now it is time to check the differences between the system recommendations by genres and the users’ real favourite genres. Or in short, how good are the predictions?
# combine the predicted and true tables with labels. then convert it to a long table. 
top_1 <- Top_15_films
top_1$type <- 'predicted'
top_2 <- Top_15_recommendations
top_2$type <- 'true'
top_bind <- rbind(top_1,top_2)
top_bind_long <- melt(top_bind, id = "type")

# plot 
ggplot(top_bind_long, aes(x = reorder(variable,-value), y = value,color=type)) +
  geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(y="Proportion",x = "Genres",title = "Genre proportion of Top-15 recommendations and Top-15 favourites per user") 

  • Drama has the highest medium proportion in both true and predicted groups.

  • Animation, Children’s, Documentary, Fantasy, Musical have the least proportion in both groups

  • Most of the genres have similiar distributions in both groups with a small differences except Western.

  • Western has much higher proportions in the users’ true favourite lists than in the prediction. This means some users in this group like Western, but they get less Western films recommended than expected.

  • evaluate the Top-recommendations comparing to Top_films by quantitative metrics MAE(mean average error,more robust than MSE), MSE(mean squared error,more sensitive to outliers than MAE), RMSE(the root mean squared error).

  • The more robust metric MAE has the medium of 0.034.

df_diff = Top_15_films - Top_15_recommendations
# calculate the mean absolute error
MAE_top_genre <- rowSums(abs(df_diff))/20
MSE_top_genre <- rowSums(df_diff^2)/20
RMSE_top_genre <- sqrt(rowSums(df_diff^2)/20)
quantitative_metrics <- t(rbind(MAE_top_genre,MSE_top_genre,RMSE_top_genre))
quantitative_metrics <- as.data.frame(quantitative_metrics)
colnames(quantitative_metrics) <- c("MAE","MSE","RMSE")
# plot
ggplot(stack(quantitative_metrics), aes(x = reorder(ind,values), y = values)) +
  geom_boxplot() + theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1),plot.title = element_text(hjust = 0.5)) + labs(x=NULL,y=NULL,title = "Quantitative metrics of Top-recommendations vs Top-films")

10.5 Definiere eine Qualitätsmetrik für Top-N Listen und teste sie.

  • I will use a ranking metric MAP (Average Precision and Mean Average Precision) to qualitatively evaluate the Top-n recommendations.

  • It works like this:

    • for each user, the top-N1 genres will be labeled as relavant genres, others are non-relavant genres. For example, if choose 6, the top-6 true genres will be labeled as relavant, others as non-relavant.

    • set the cut-off N2: evaluate the top-N2 genres in the recommendation list.

    • Now go to the prediction list (the top-recommendations), sort the genres by prediction rankings (rank 1, 2, 3,….).

    • Scan the N2 best ranked items in prediction list. Find the prediction positions (prediction ranks) of relavant genres.

    • calculate the average precision of this user like this: mean ( 1/position1 + 2/position2 + 3/position3 + …).

    • Some users may have more relavant genres, some less.

  • then calculate the mean of average-precisions (MAP) from all users.

  • But MAP metric has the drawback that it does not consider the recommended list as an ordered list.

  • I will define two functions to calculate the average-precisions of all users and the average of average-precisions of all users.

  • define a function avg_precision, which calculates the average precision of one user. This function will be applied in the function MAP.

# calculate average precision of one user. The function will be applied in MAP function
# input: dataframe with columns of rank_true, rank_pred, genres
# N1: max number of possible relavant items
# N2: scan the top-N2 recommendations
avg_precision <- function(df,N1,N2){
  count_relv = 0
  sum_relv = 0
  for(i in 1:N2){
    row_i <- df[i,]
    if (row_i["rank_true"] <= N1){# this means it is a relavant genre
      count_relv = count_relv + 1
      sum_relv = sum_relv + count_relv/i
    }}
  return(sum_relv/count_relv)}
  • calculate the Mean Average Precision of many users given top_n_films and top_n_predictions.
# function MAP will calculate the average precision of all users.
# input: top_n_films, top_n_predictions, N1(how many relavant items), N2 (scan the top-N2 items in predictions)
# output: a dataframe with the average precisions of all users.
MAP <- function(top_true,top_pred,N1,N2){
  avg_precision_list = NULL # the list used to collection the average-precision of all users
  for(i in 1:dim(top_true)[1]){ # each time analyse one user
    # transpose
    top_true_i <- as.data.frame(t(top_true[i,])) 
    top_pred_i <- as.data.frame(t(top_pred[i,]))
    # rename the column
    colnames(top_true_i) <- 'proportion'
    colnames(top_pred_i) <- 'proportion'
    # sort by value
    top_true_i <- top_true_i %>% arrange(.,desc(proportion))%>%select(-proportion)
    top_pred_i <- top_pred_i %>% arrange(.,desc(proportion))%>%select(-proportion)
    # generate new column "rank"
    top_true_i$rank_true <- 1:18
    top_pred_i$rank_pred <- 1:18
    # copy rownames as a column "genres", drop rownames
    top_true_i$genres <- rownames(top_true_i)
    top_pred_i$genres <- rownames(top_pred_i)
    rownames(top_true_i)<-NULL
    rownames(top_pred_i)<-NULL
    # left-join top_true_i to top_pred_i
    top_join <- left_join(top_pred_i,top_true_i,by="genres")
    avg_precision_i <- avg_precision(top_join,N1,N2)
    avg_precision_list <- c(avg_precision_list,avg_precision_i)}
  df_ap <- as.data.frame(avg_precision_list)
  rownames(df_ap ) <- rownames(top_true)
  colnames(df_ap) <- 'average_precision'
  return(df_ap)
  }
top_true <- Top_15_films 
top_pred <- Top_15_recommendations
result_105 <- MAP(top_true,top_pred,5,5) %>% arrange(.,desc(average_precision))
result_105
barplot(height = result_105[,1],xlab="userID",ylab="average precision",names.arg=c(rownames(result_105)),las=2)

print(paste("mean of average-precision (MAP@5) is", colMeans(result_105)))
## [1] "mean of average-precision (MAP@5) is 0.82625"
  • The recommender system has a relatively good MAP of 0.83, where the (average-precision) of the 20 users are between 0.37 and 1.0.
  • The user 303 871 have the of 1, the user 523 has the smallest of 0.37
  • This means the system could recommend users’ top-5 favorite genres with precision of 83%.

11 Summary

metrics_by_film <- best_models[1,] %>% select(-model)
metrics_by_film
metrics_by_genres <- as.data.frame(mean(quantitative_metrics$MAE)) %>% mutate(.,MAP=colMeans(result_105))
colnames(metrics_by_genres) <- c("avg_MAE","MAP")
metrics_by_genres
11.1 model selection
- my recommender system use IBCF model with center normalization, cosine similarity and 20 neighbors
- Through four models IBCF UBCF SVD SVDF, with different parameter combinations, my model has the best balance of metrics coverage (0.33), novelty (0.52), f1_score (0.14), precision (0.34), and recall (0.09).
11.2 recommender system evaluation
- Evaluate the recommender system by genres of the top-N recommendations and true top-N favourite films quantitatively (mean absolute error) and qualitatively (mean of average precision) 
- The predicted genres proportion and true genres proportion have showed very similiar trends, and with a low mean absolute error of 0.035.
- Evaluate the genre by ranking metric mean of average precision (MAP@5), it reaches a high MAP of 0.83.
- My recommender system could give personalized recommendation based on genres with high precision.